# Default figure size applied to every knitr chunk.
knitr::opts_chunk$set(
  fig.width = 12,
  fig.height = 8
)
library(tidyverse)
library(descr)
library(Amelia)
library(lubridate)
library(ggmap)
library(quantreg)
library(shapefiles)
library(leaflet)
library(stringi)
library(ngram)
library(qdap)
library(rgdal)
library(plotly)
library(data.table)
# Load the Toronto Yelp business and review extracts from CSV.
business <- read_csv(file = "yelp_toronto_business.csv")
review <- read_csv(file = "yelp_toronto_review.csv")
Our focus will be on the business data, but we will use the review data to get some insights about businesses and will do some feature engineering to create useful variables for the model.
# Column-wise preview of the review table (types and first values).
review %>%
  glimpse()
Observations: 474,803
Variables: 9
$ review_id <chr> "kS4hrhEScwB9V5JATYjvVQ", "YDJDfKnx6VpMMo4EBxycGg", "2Hk7DNwu3rb2j...
$ user_id <chr> "hxqo4NyJFfeOmuoVi--s1A", "FCtoTo9zSH1cSAkascfEHw", "YHWsLBS8jzZiP...
$ business_id <chr> "f5O7v_X_jCg2itqacRfxhg", "7xA6iSP0Ndn08tpBFQtUKA", "SmizR7MLt-558...
$ stars <int> 5, 1, 4, 4, 2, 3, 2, 3, 4, 4, 2, 5, 4, 5, 4, 1, 1, 4, 3, 3, 3, 4, ...
$ date <date> 2017-10-12, 2017-05-22, 2011-06-01, 2011-11-07, 2011-08-20, 2011-...
$ text <chr> "Sansotei serves some top notch ramen. They take no reservations, ...
$ useful <int> 0, 0, 1, 4, 0, 2, 9, 3, 2, 5, 8, 16, 0, 7, 8, 1, 5, 1, 3, 2, 2, 2,...
$ funny <int> 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 0, 1, ...
$ cool <int> 0, 0, 1, 0, 0, 1, 4, 2, 1, 3, 1, 9, 0, 0, 3, 0, 1, 2, 2, 1, 2, 1, ...
# Are there any missing values?
# missmap() is given a plain data.frame: when it receives the tibble produced
# by read_csv() it emits "the condition has length > 1" and
# "Unknown or uninitialised column" warnings.
missmap(
  as.data.frame(review),
  col = c("yellow", "darkgreen"),
  main = "Missing values vs observed"
)
the condition has length > 1 and only the first element will be usedUnknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'arguments'.Unknown or uninitialised column: 'imputations'.
Good: there are no missing values in the review dataset. Later we will check whether some of its variables help in building our model.
# Add a per-review word count (number of "\\w+" matches in `text`) and order
# the reviews from the longest to the shortest.
# `text` is referenced through mutate()'s data mask rather than as
# `review$text`, so the expression always sees the rows flowing through the
# pipe instead of a copy of the global table.
# NOTE(review): the result stays grouped by business_id and word_count;
# add ungroup() if later steps do not rely on that grouping.
review <- review %>%
  mutate(word_count = str_count(text, "\\w+")) %>%
  group_by(business_id, word_count) %>%
  arrange(desc(word_count))
# Column-wise preview of the business table (types and first values).
business %>%
  glimpse()
Observations: 18,233
Variables: 13
$ address <chr> "631 Bloor St W", "595 Markham Street", "746 Street Clair Avenue ...
$ business_id <chr> "9A2quhZLyWk0akUetBd8hQ", "tZnSodhPwNr4bzrwJ1CSbw", "5J3b7j3Fzo9I...
$ categories <chr> "Food, Bakeries", "Cajun/Creole, Southern, Restaurants", "Food, B...
$ city <chr> "Toronto", "Toronto", "Toronto", "Toronto", "Toronto", "Toronto",...
$ is_open <int> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,...
$ latitude <dbl> 43.66438, 43.66412, 43.68133, 43.67088, 43.73340, 43.66196, 43.66...
$ longitude <dbl> -79.41442, -79.41189, -79.42788, -79.39238, -79.22421, -79.39126,...
$ name <chr> "Bnc Cake House", "Southern Accent Restaurant", "Mabel's Bakery",...
$ neighborhood <chr> "Koreatown", "Palmerston", "Wychwood", "Yorkville", "Scarborough"...
$ postal_code <chr> "M6G 1K8", "M6G 2L7", "M6C 1B5", "M5R 3K5", "M1M 1P8", "M5S", "M5...
$ review_count <int> 7, 146, 23, 25, 3, 3, 3, 105, 3, 31, 9, 3, 51, 5, 6, 4, 15, 38, 4...
$ stars <dbl> 4.0, 4.0, 4.0, 3.5, 2.0, 4.5, 4.5, 4.0, 3.5, 4.0, 4.0, 1.0, 3.5, ...
$ state <chr> "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON",...
# Share of missing values in each column of `business`, drawn as a bar chart
# ordered from the most to the least incomplete column.
# vapply() guarantees exactly one double per column; sapply() would silently
# change its return type on unexpected input.
missing <- data.table(
  pmiss = vapply(
    business,
    function(x) sum(is.na(x)) / length(x),
    numeric(1)
  ),
  column = names(business)
)

p <- ggplot(missing, aes(x = reorder(column, -pmiss), y = pmiss)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  scale_y_continuous(labels = scales::percent) +
  # Tilt the column names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Missing data by feature",
    x = "Feature",
    y = "% missing"
  )
ggplotly(p)
Summary of the business data (18,233 observations and 13 variables):
$ address character class with 14422 unique values and 283 missing values
$ business_id character class with 18233 unique values and 0 missing values
$ categories character class with 10028 unique values and 33 missing values
$ city character class with 1 unique values and 0 missing values
$ is_open integer class with 2 unique values and 0 missing values
$ latitude numeric class with 15366 unique values and 1 missing values
$ longitude numeric class with 15315 unique values and 1 missing values
$ name character class with 15292 unique values and 0 missing values
$ neighborhood character class with 80 unique values and 3435 missing values
$ postal_code character class with 5261 unique values and 117 missing values
$ review_count integer class with 380 unique values and 0 missing values
$ stars numeric class with 9 unique values and 0 missing values
$ state character class with 2 unique values and 0 missing values
# Correct the coordinates of the business located at "2138 Queen Street E".
# which() drops the NA comparisons produced by missing addresses, so only
# rows whose address matches exactly are rewritten; nothing happens when no
# row matches.
business <- business %>%
  mutate(
    longitude = replace(
      longitude,
      which(address == "2138 Queen Street E"),
      -79.293425
    ),
    latitude = replace(
      latitude,
      which(address == "2138 Queen Street E"),
      43.671584
    )
  )

# Add coordinates for the postal code M5H 4G1 (lon: -79.3854, lat: 43.6508)
# to the rows whose longitude is missing.
# Latitude is filled first: the row selection tests `longitude`, so it must be
# evaluated before the longitude itself is overwritten.
business <- business %>%
  mutate(
    latitude = replace(latitude, is.na(longitude), 43.6508),
    longitude = replace(longitude, is.na(longitude), -79.3854)
  )
# Recode `is_open` from integer to a "no"/"yes" factor, lower-case the
# free-text columns, and convert the numeric `stars` rating to an ordered
# factor.
# Factor levels are spelled out so that their order does not depend on which
# value happens to appear first in the data.
business <- business %>%
  mutate(
    is_open = factor(
      if_else(as.logical(is_open), "yes", "no"),
      levels = c("no", "yes")
    ),
    categories = str_to_lower(categories),
    name = str_to_lower(name),
    neighborhood = str_to_lower(neighborhood),
    stars = factor(
      as.character(stars),
      levels = c("1", "1.5", "2", "2.5", "3", "3.5", "4", "4.5", "5"),
      ordered = TRUE
    )
  )
Let's check the distribution of the target variable. How many businesses are open? How many are closed?
# Number of businesses per open/closed status, one coloured bar per status.
p <- ggplot(data = business, mapping = aes(x = is_open)) +
  geom_bar(mapping = aes(fill = is_open))
ggplotly(p)
14023 businesses are open (77%) and 4210 closed (23%). Our target variable is imbalanced.
# Star rating distribution: one bar per rating level, coloured by rating.
p <- ggplot(data = business, mapping = aes(x = stars)) +
  geom_bar(mapping = aes(fill = stars)) +
  labs(title = "Star rating distribution")
ggplotly(p)
Around 70% of the businesses got a rating between 3 and 4.5
# Distribution of the number of reviews per business.
# The bar heights (counts) are drawn on a log10 y-axis because the
# distribution is heavily skewed.
# Only `binwidth` is supplied: geom_histogram() lets `binwidth` override
# `bins`, so passing both would leave `bins` without any effect.
p <- ggplot(data = business, aes(x = review_count)) +
  geom_histogram(binwidth = 25, fill = "darkgreen") +
  # geom_density(alpha = .2, fill = "#FF6666") +
  scale_y_log10() +
  labs(title = "Distribution of reviews")
ggplotly(p)
Transformation introduced infinite values in continuous y-axis
#ggsave("reviews_dist.png", width = 10, height = 5)
Closed businesses received fewer reviews, and closure does not seem to depend on the rating. The graph below supports that assumption.
# Star ratings split by open/closed status, with the two bars side by side.
p <- ggplot(data = business, mapping = aes(x = stars, fill = is_open)) +
  geom_bar(position = "dodge") +
  labs(
    x = "Star rating",
    y = "Number of businesses",
    title = "Star rating distribution business open/closed"
  )
ggplotly(p)
#ggsave("stars_dist2.png", width = 5, height = 5)
The rating distribution is the same for open and closed businesses, so closure may not be related to the rating.
# Open vs closed businesses per neighborhood, as stacked horizontal bars.
p <- ggplot(
  data = business,
  mapping = aes(x = neighborhood, fill = is_open)
) +
  geom_bar(position = "stack") +
  coord_flip()
ggplotly(p)
# Centre of the map: median longitude and latitude over all businesses.
center_long <- median(business$longitude, na.rm = TRUE)
center_lat <- median(business$latitude, na.rm = TRUE)

# One circle per business; the radius is the square root of its review count.
leaflet(data = business) %>%
  addTiles() %>%
  # addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(
    lng = ~longitude,
    lat = ~latitude,
    radius = ~ sqrt(review_count)
  ) %>%
  setView(lng = center_long, lat = center_lat, zoom = 10)
NA
Leading Business Categories:
Let’s look at the leading business categories in our dataset. A business can be linked to multiple categories, so we have to do a bit of preprocessing, which is simple with the dplyr package.
# The 25 most frequent business categories.
# `categories` holds a comma-separated list per business, so it is split into
# a list-column and unnested to one row per (business, category) pair before
# counting.
# The split happens in mutate() and the column is then named in unnest():
# the `unnest(col = expr)` form is deprecated since tidyr 1.0.0.
categorie <- business %>%
  mutate(categories = str_split(categories, ",")) %>%
  unnest(categories) %>%
  mutate(categories = str_trim(categories, side = "both")) %>%
  # count(sort = TRUE) yields the column `n`, in descending order.
  count(categories, sort = TRUE) %>%
  head(25)
# Horizontal bar chart of the leading categories, ordered by their count and
# coloured on a Spectral gradient.
p <- ggplot(
  data = categorie,
  mapping = aes(x = reorder(categories, n), y = n)
) +
  geom_col(mapping = aes(fill = n)) +
  scale_fill_gradientn(
    colours = RColorBrewer::brewer.pal(11, "Spectral")
  ) +
  coord_flip()
ggplotly(p)